import polars as pl
import polars.selectors as cs
print(pl.__version__)
1.6.0
8.1 Parsing Unix timestamps
The columns in polars dataframes are statically typed, meaning there is no ambiguity regarding parsing data as integers or as dates. The file we’re using here is a popularity-contest file I found on my system at /var/log/popularity-contest.
Here’s an explanation of how this file works.
# Load the space-separated popularity-contest file, renaming its columns, and
# then discard every row in which all columns are null. Per the original note,
# this is what removes the file's last row.
popcon = pl.read_csv(
    '../data/popularity-contest',
    separator=' ',
    ignore_errors=True,
    new_columns=['atime', 'ctime', 'package-name', 'mru-program', 'tag'],
).filter(
    # Keep a row unless every one of its columns is null.
    pl.all_horizontal(pl.all().is_null()).not_()
)
popcon.shape
(2897, 5)
The columns are the access time, created time, package name, recently used program, and a tag. In this case, polars has parsed the access time and created time as integers instead of datetimes.
popcon.head()

| atime | ctime | package-name | mru-program | tag |
|---|---|---|---|---|
| i64 | i64 | str | str | str |
| 1387295797 | 1367633260 | "perl-base" | "/usr/bin/perl" | null |
| 1387295796 | 1354370480 | "login" | "/bin/su" | null |
| 1387295743 | 1354341275 | "libtalloc2" | "/usr/lib/x86_64-linux-gnu/libt… | null |
| 1387295743 | 1387224204 | "libwbclient0" | "/usr/lib/x86_64-linux-gnu/libw… | "<RECENT-CTIME>" |
| 1387295742 | 1354341253 | "libselinux1" | "/lib/x86_64-linux-gnu/libselin… | null |
We can explicitly convert the integers to datetimes using the from_epoch function:
popcon = popcon.with_columns(
pl.from_epoch('atime', time_unit='s'),
pl.from_epoch('ctime') #time_unit='s' is default
)
If we look at the dtype now, it’s pl.Datetime.
popcon['atime'].dtype
Datetime(time_unit='us', time_zone=None)
So now we can look at our atime and ctime as dates!
popcon.head()

| atime | ctime | package-name | mru-program | tag |
|---|---|---|---|---|
| datetime[μs] | datetime[μs] | str | str | str |
| 2013-12-17 15:56:37 | 2013-05-04 02:07:40 | "perl-base" | "/usr/bin/perl" | null |
| 2013-12-17 15:56:36 | 2012-12-01 14:01:20 | "login" | "/bin/su" | null |
| 2013-12-17 15:55:43 | 2012-12-01 05:54:35 | "libtalloc2" | "/usr/lib/x86_64-linux-gnu/libt… | null |
| 2013-12-17 15:55:43 | 2013-12-16 20:03:24 | "libwbclient0" | "/usr/lib/x86_64-linux-gnu/libw… | "<RECENT-CTIME>" |
| 2013-12-17 15:55:42 | 2012-12-01 05:54:13 | "libselinux1" | "/lib/x86_64-linux-gnu/libselin… | null |
Now suppose we want to look at all packages that aren’t libraries. First, I want to get rid of everything with timestamp 0.
# Show the three rows with the smallest access time before filtering.
print("before filter")
display(popcon.bottom_k(3, by='atime'))

# Keep only rows whose access time is strictly later than 1970-01-01,
# i.e. drop the rows whose timestamp was 0.
popcon = popcon.filter(pl.col('atime').gt(pl.datetime(1970, 1, 1)))

print("after filter")
display(popcon.bottom_k(3, by='atime'))
before filter
| atime | ctime | package-name | mru-program | tag |
|---|---|---|---|---|
| datetime[μs] | datetime[μs] | str | str | str |
| 1970-01-01 00:00:00 | 1970-01-01 00:00:00 | "librsync1" | "<NOFILES>" | null |
| 1970-01-01 00:00:00 | 1970-01-01 00:00:00 | "libindicator-messages-status-p… | "<NOFILES>" | null |
| 1970-01-01 00:00:00 | 1970-01-01 00:00:00 | "libxfconf-0-2" | "<NOFILES>" | null |
after filter
| atime | ctime | package-name | mru-program | tag |
|---|---|---|---|---|
| datetime[μs] | datetime[μs] | str | str | str |
| 2008-11-20 14:38:20 | 2012-12-01 05:54:57 | "libfile-copy-recursive-perl" | "/usr/share/perl5/File/Copy/Rec… | "<OLD>" |
| 2010-02-22 14:59:21 | 2012-12-01 05:54:14 | "libfribidi0" | "/usr/bin/fribidi" | "<OLD>" |
| 2010-03-06 14:44:18 | 2012-12-01 05:54:37 | "laptop-detect" | "/usr/sbin/laptop-detect" | "<OLD>" |
Now we can use polars’ filter and str to look at rows where the package name doesn’t contain ‘lib’.
# Select the rows whose package name does not contain the substring 'lib'.
has_lib = pl.col('package-name').str.contains('lib')
nonlibraries = popcon.filter(has_lib.not_())
nonlibraries.top_k(10, by='ctime')

| atime | ctime | package-name | mru-program | tag |
|---|---|---|---|---|
| datetime[μs] | datetime[μs] | str | str | str |
| 2013-12-17 04:55:39 | 2013-12-17 04:55:42 | "ddd" | "/usr/bin/ddd" | "<RECENT-CTIME>" |
| 2013-12-16 20:03:20 | 2013-12-16 20:05:13 | "nodejs" | "/usr/bin/npm" | "<RECENT-CTIME>" |
| 2013-12-16 20:03:20 | 2013-12-16 20:05:04 | "switchboard-plug-keyboard" | "/usr/lib/plugs/pantheon/keyboa… | "<RECENT-CTIME>" |
| 2013-12-16 20:03:20 | 2013-12-16 20:05:04 | "thunderbird-locale-en" | "/usr/lib/thunderbird-addons/ex… | "<RECENT-CTIME>" |
| 2013-12-16 20:08:27 | 2013-12-16 20:05:03 | "software-center" | "/usr/sbin/update-software-cent… | "<RECENT-CTIME>" |
| 2013-12-16 20:03:20 | 2013-12-16 20:05:00 | "samba-common-bin" | "/usr/bin/net.samba3" | "<RECENT-CTIME>" |
| 2013-12-16 20:08:25 | 2013-12-16 20:04:59 | "postgresql-client-9.1" | "/usr/lib/postgresql/9.1/bin/ps… | "<RECENT-CTIME>" |
| 2013-12-16 20:08:23 | 2013-12-16 20:04:58 | "postgresql-9.1" | "/usr/lib/postgresql/9.1/bin/po… | "<RECENT-CTIME>" |
| 2013-12-16 20:03:20 | 2013-12-16 20:04:55 | "php5-dev" | "/usr/include/php5/main/snprint… | "<RECENT-CTIME>" |
| 2013-12-16 20:03:20 | 2013-12-16 20:04:54 | "php-pear" | "/usr/share/php/XML/Util.php" | "<RECENT-CTIME>" |